# Creates Wordscore Estimates

library(tm)
library(austin)


#################

  #1. Prepping Documents

  # Load all documents being estimated by inserting the file location below.
  # To switch between Labour and Conservative MPs, comment out one of the two
  # `directory` assignments and uncomment the other.

  # Text files and reference documents for Labour MPs
directory <- ".../Labour Files"

  # Text files and reference documents for Conservative MPs
#directory <- ".../Conservative Files"

# Read every file in `directory` as plain text into a tm corpus, then apply
# the cleaning steps one after another. The list order is the order of
# application: strip extra whitespace, lower-case, remove digits.
textcorpus <- Reduce(
  function(corpus, clean_step) tm_map(corpus, clean_step),
  list(
    stripWhitespace,
    content_transformer(tolower),
    removeNumbers
  ),
  Corpus(
    DirSource(directory),
    readerControl = list(reader = readPlain)
  )
)

  # Remove Stopwords
# Hand-built stopword list, kept in its original order. Most entries are
# common English function words; the tail looks like corpus-specific tokens
# (parliamentary terms, name fragments, apparent transcription artefacts) --
# NOTE(review): confirm these against the source transcripts.
custom_stopwords <- c(
  "a", "also", "am", "an", "and", "are", "as", "at", "be", "because",
  "but", "by", "go", "going", "gone", "he", "her", "hers", "herself",
  "him", "himself", "his", "i", "if", "in", "is", "it", "itself", "may",
  "me", "my", "myself", "of", "on", "or", "our", "ours", "ourselves",
  "out", "perhaps", "so", "that", "the", "their", "theirs", "them",
  "then", "therefore", "these", "they", "this", "those", "though",
  "thus", "to", "us", "was", "we", "went", "when", "where", "whether",
  "which", "while", "who", "whom", "with", "would", "you", "your",
  "yours", "yourself", "yourselves", "for", "its", "has", "had", "have",
  "s", "now", "do", "does", "there", "yes", "no", "than", "can", "could",
  "re", "she", "will", "per", "non", "how", "from", "been", "being",
  "irranca", "hon", "ion", "yts", "benchers", "tories", "finsbury",
  "some", "such", "said", "say", "tory",
  "ms short",  # two-word entry: passed to removeWords as a single phrase
  "theour", "ask", "any", "all", "point", "not"
)

# Delete every listed word from each document in the corpus.
textcorpus <- tm_map(textcorpus, removeWords, custom_stopwords)

# Term-by-document count matrix for the cleaned corpus (terms in rows,
# documents in columns).
wordmat <- TermDocumentMatrix(textcorpus)

# Final set of words: drop rare terms using a sparsity threshold of 0.97,
# then convert the sparse result to an ordinary dense matrix.
wordmat1 <- as.matrix(removeSparseTerms(wordmat, sparse = 0.97))


########################

  #2. Wordscores

  # Set Reference Documents, virgin documents, all documents
# Locate the two reference texts by file name. The order matters: it must
# line up with `scores = c(-5, 5)` below (left reference = -5, right = 5).
ref_files <- c("ref_left.txt", "ref_right.txt")
ref <- match(ref_files, colnames(wordmat1))

# Fail loudly if a reference text is missing. With an empty `ref`, the old
# `vir[-ref]` idiom silently returned NO virgin texts, and with only one
# reference found the scores would be paired with the wrong documents.
if (anyNA(ref)) {
  stop(
    "Reference document(s) not found in the corpus: ",
    paste(ref_files[is.na(ref)], collapse = ", "),
    call. = FALSE
  )
}

# Virgin texts are all remaining documents, in their original column order.
vir <- setdiff(seq_len(ncol(wordmat1)), ref)
if (length(vir) == 0) {
  stop("No virgin documents found besides the reference texts.", call. = FALSE)
}

# Convert to the format needed for austin, a word frequency matrix
wordmatws <- as.wfm(wordmat1)

# Score the reference documents
r <- getdocs(wordmatws, ref)
ws <- classic.wordscores(r, scores = c(-5, 5))
thescores <- coef(ws)

# Diagnostics (auto-printed when run interactively or via Rscript):
# number of words scored exactly 5, exactly -5, and the total scored.
sum(thescores == 5, na.rm = TRUE)
sum(thescores == -5, na.rm = TRUE)
length(thescores)

  # Wordscore results1 for all virgin documents
# Score every virgin document with the fitted Wordscores model.
v <- getdocs(wordmatws, vir)
results1 <- as.data.frame(predict(ws, newdata = v))

# Sort out names and period dummy.
# The row names of the prediction table are the document (file) names.
# `names` is kept as a global for compatibility with the original script;
# it does not interfere with calls to the base function names().
names <- rownames(results1)
results1$names <- names

# firstperiod = 1 when the file name contains "8794", 0 otherwise.
# Vectorised so that it also behaves correctly for zero rows, where
# `1:length(x)` would have iterated over c(1, 0).
results1$firstperiod <- as.numeric(
  grepl("8794", results1$names, fixed = TRUE)
)

# Keep the raw file name, then build a display name from it:
# underscores -> spaces, drop the " 8794" period tag, drop the extension.
results1$namesOld <- results1$names
results1$names <- gsub("_", " ", results1$names, fixed = TRUE)
results1$names <- gsub(" 8794", "", results1$names, fixed = TRUE)
# Escaped and anchored: the previous pattern ".txt" treated "." as "any
# character" and matched anywhere, so it could eat part of a name
# (e.g. "atxt" inside a word) rather than only the trailing extension.
results1$names <- sub("\\.txt$", "", results1$names)

  # standard errors and confidence intervals
# The second column of the prediction table is used as the standard error;
# give it a syntactic name so it can be referenced directly.
colnames(results1)[2] <- "stderror"

# Interval bounds: Score -/+ 1.96 standard errors.
results1[["lower"]] <- results1[["Score"]] - 1.96 * results1[["stderror"]]
results1[["upper"]] <- results1[["Score"]] + 1.96 * results1[["stderror"]]

  # final dataset of positions
# Columns are picked by position. NOTE(review): the indices assume predict()
# returned five columns, so that 6 = names, 1 = Score, 9 = lower, 10 = upper,
# 2 = stderror, 7 = firstperiod -- confirm against the austin version in use.
results1 <- results1[, c(6, 1, 9, 10, 2, 7), drop = FALSE]

# `...` is a placeholder: replace it with the output file path before running.
save(results1, file=...)
